1 Problematic

  1. 拜登正式就任美國第46任總統後,Twitter也將總統官方帳號POTUS轉交拜登團隊。與四年前交接不同的是,拜登並未接收先前川普的POTUS粉絲基礎,追蹤者將從零開始。多數美國媒體聲稱過去拜登的個人推特經營主要面向美國大眾推廣政策理念,相反的川普帳號只跟自己的支持者對話。如今,拜登的個人推特帳號主要扮演轉推POTUS推文的角色,而時下未見有拜登POTUS的詳細分析。拜登POTUS在公共關係經營上真的還是走傳統、建制路線嗎?本專案想要探討拜登POTUS是僅僅將twitter當作另一個宣傳政令跟政府講話的媒體平台,還是存在更簡結有力的政策想法以及情緒宣傳?

  2. 本專案預設tokenize去掉停詞之後取得的關鍵字對於推文含意的辨識度還是有權重差異的,有些特別具象化單詞會被推論為具有特定政策意涵或與特定政策事件相關。而其他較模糊的政治相關用語或一般字詞則被視為不具有明確政策資訊內涵。因此,本專案簡單假設如果POTUS只是一個單純宣傳政令或講話的帳號,應該大量含有上述辨識度高的具象化單字以指涉特定議題或事件。或者,在喜歡數或轉推數較高的推文中,要可以觀察到上述情形。

2 General Analysis

2.1 Preprocessing

library(dplyr)
library(tibble)
library(ggplot2)
library(stringr)
library(tidyr)
library(lubridate)
library(scales)
library(tidytext)
library(textdata)
library(cowplot)
#Load the dataset obtained from TwitteR package
load(file = "biden_tweets_df.rda")
load(file = "AllJoeBidenTweets.rda")
# Mutate source column and select other meaningful columns
df <- biden_tweets_df %>% 
    mutate(source = sapply(biden_tweets_df$statusSource , function(x) {
        if (str_detect(x, pattern = "Twitter Web App")) return("Twitter_Web_App")
        else if (str_detect(x, pattern = "Periscope")) return("Periscope")
        else if (str_detect(x, pattern = "Twitter Media Studio")) return("Twitter_Media_Studio")
        else if (str_detect(x, pattern = "Sprinklr")) return("Sprinklr")
        else return("The_White_House")
    })) %>% 
    select(id, source, text, created, retweetCount, favoriteCount) %>%
    mutate(hour = hour(with_tz(created, "EST")))

2.2 Collect tweet words

reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
tweet_words <- df %>%
    filter(!str_detect(text, '^"')) %>%
    mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
    unnest_tokens(word, text, token = "regex", pattern = reg) %>%
    filter(!word %in% stop_words$word,
           str_detect(word, "[a-z]")) %>%
    arrange(desc(favoriteCount))

2.3 Time line

2.3.1 Time line of retweetCount and FavoriteCount from Feb to May

ggplot(data = df) + 
    geom_col(mapping = aes(x = created, y = retweetCount, color = source, fill = source))

ggplot(data = df) + 
    geom_col(mapping = aes(x = created, y = favoriteCount, color = source, fill = source))

2.3.2 Tweeting in Hour of a Day

tweeting_hour <- df %>% 
    group_by(hour) %>%
    summarise(number=n(), 
              sum_favoriteCount = sum(favoriteCount), 
              sum_retweetCount = sum(retweetCount),
              mean_favoriteCount = mean(favoriteCount), 
              mean_retweetCount = mean(retweetCount)) %>%
    arrange(desc(number))
tweeting_hour_20 <- tweet_words %>%
    select(hour, word, retweetCount, favoriteCount) %>%
    inner_join(get_sentiments("nrc")) %>%
    arrange(desc(favoriteCount)) %>%
    top_n(20, favoriteCount)
p1 <- df %>% 
    count(hour = hour(with_tz(created, "EST"))) %>% 
    mutate(percent = n / sum(n)) %>%
    ggplot(mapping = aes(hour, percent)) + 
    geom_line() + 
    scale_y_continuous(labels = percent_format()) + 
    labs(x = "Hour of a day", y = "% of tweets")
p2 <- ggplot(tweeting_hour) + 
    geom_line(aes(hour, sum_favoriteCount), color="blue") +
    labs(x = "Hour of a day")
p3 <- ggplot(tweeting_hour) +
    geom_line(aes(hour, mean_favoriteCount), color="red") +
    labs(x = "Hour of a day")
p4 <- ggplot(tweeting_hour_20) +
    geom_point(aes(x = hour, y = word)) +
    labs(x = "Hour of a day")
plot_grid(p1, p2, p3, p4, ncol = 2)

3 Comparison of Words

3.1 Word Frequency Plot

tweet_words %>%
    count(word) %>%
    mutate(word = reorder(word, n)) %>%
    top_n(20, n) %>%
    ggplot() +
    geom_bar(aes(x = word, y = n), stat = "identity") +
    ylab("frequency of occurrence") +
    coord_flip()

3.2 Word Weighting of different sources (Video Streaming App and The White House)

# Word Weighting of different sources (Video Streaming App and The White House)
WhiteHouse_periscope_ratios <- tweet_words %>%
  count(word, source) %>%
  filter(sum(n) >= 5) %>%
  spread(source, n, fill = 0) %>%
  ungroup() %>%
  mutate_each(funs((. + 1) / sum(. + 1)), -word) %>%
  mutate(logratio = log2(The_White_House / Periscope)) %>%
  arrange(desc(logratio))
WhiteHouse_periscope_ratios %>%
  group_by(logratio > 0) %>%
  top_n(8, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_fixed(ratio = 1000000) +
  coord_flip() +
  ylab("The White House / Periscope log ratio") +
  scale_fill_manual(name = "", labels = c("The White House", "Periscope"),
                    values = c("red", "lightblue"))

4 Sentiment Analysis

4.1 Tweets Sentiment Overview

sentiment_overview <- tweet_words %>%
    inner_join(get_sentiments("nrc")) %>%
    filter(!is.na(sentiment)) %>%
    count(sentiment, sort = TRUE)
sentiment_overview
## # A tibble: 10 x 2
##    sentiment        n
##    <chr>        <int>
##  1 positive       761
##  2 trust          536
##  3 anticipation   418
##  4 negative       326
##  5 joy            259
##  6 fear           185
##  7 sadness        147
##  8 anger          133
##  9 surprise       131
## 10 disgust         64
ggplot(sentiment_overview) + 
  geom_bar(aes(x = reorder(sentiment, n), y = n), stat = "identity") +
  coord_flip()

4.2 Tweets Sentiment by Source

sources <- tweet_words %>%
    group_by(source) %>%
    mutate(total_words = n()) %>%
    ungroup() %>%
    distinct(id, source, total_words)
by_source_sentiment <- tweet_words %>%
    inner_join(get_sentiments("nrc"), by = "word") %>%
    count(sentiment, id) %>%
    ungroup() %>%
    complete(sentiment, id, fill = list(n = 0)) %>%
    inner_join(sources) %>%
    group_by(source, sentiment, total_words) %>%
    summarize(words = sum(n)) %>%
    arrange(desc(total_words)) %>%
    ungroup()
by_source_sentiment
## # A tibble: 50 x 4
##    source          sentiment    total_words words
##    <chr>           <chr>              <int> <dbl>
##  1 The_White_House anger               4442   123
##  2 The_White_House anticipation        4442   393
##  3 The_White_House disgust             4442    57
##  4 The_White_House fear                4442   171
##  5 The_White_House joy                 4442   247
##  6 The_White_House negative            4442   304
##  7 The_White_House positive            4442   692
##  8 The_White_House sadness             4442   136
##  9 The_White_House surprise            4442   123
## 10 The_White_House trust               4442   494
## # ... with 40 more rows

4.3 Usage of Common Sentimental Words

nrc_word_counts <- tweet_words %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
  
nrc_word_counts
## # A tibble: 960 x 3
##    word      sentiment        n
##    <chr>     <chr>        <int>
##  1 plan      anticipation    87
##  2 rescue    anticipation    52
##  3 rescue    joy             52
##  4 rescue    positive        52
##  5 rescue    surprise        52
##  6 rescue    trust           52
##  7 time      anticipation    47
##  8 president positive        39
##  9 president trust           39
## 10 nation    trust           30
## # ... with 950 more rows
nrc_word_counts %>% 
  group_by(sentiment) %>% 
  top_n(8, n) %>% 
  arrange(desc(n)) %>% 
  ungroup() %>%
  ggplot() +
  geom_histogram(aes(x = reorder(word, n), y = n), stat = "identity", width = 0.75) +
  coord_flip() +
  facet_wrap(vars(sentiment), scales = "free") + 
  theme(axis.text.y = element_text(size = rel(0.75))) + 
  labs(x = "Word", y = "", title = "Frequency of Occurrence by Sentiment")

4.4 Usage of Common Sentimental Words with Top Fifty FavoriteCount

# Preprocessing
favorite <- df%>%
  arrange(desc(favoriteCount)) %>%
  top_n(50)
# Collecting Tweets Words
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
favorite_words <- favorite %>%
    filter(!str_detect(text, '^"')) %>%
    mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
    unnest_tokens(word, text, token = "regex", pattern = reg) %>%
    filter(!word %in% stop_words$word,
           str_detect(word, "[a-z]")) %>%
    arrange(desc(favoriteCount))
# View the difference between whole POTUS tweets and top 50 favorite tweets 
sentiment_overview_50 <- favorite_words %>%
    inner_join(get_sentiments("nrc")) %>%
    filter(!is.na(sentiment)) %>%
    count(sentiment, sort = TRUE)
sentiment_overview_50
## # A tibble: 10 x 2
##    sentiment        n
##    <chr>        <int>
##  1 positive        49
##  2 trust           45
##  3 negative        26
##  4 anticipation    21
##  5 fear            18
##  6 joy             16
##  7 anger           14
##  8 sadness         13
##  9 surprise         9
## 10 disgust          4
sentiment_overview_50 %>% 
  ggplot() + 
  geom_bar(aes(x = reorder(sentiment, n), y = n), stat = "identity") +
  coord_flip()

nrc_word_counts <- favorite_words %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
nrc_word_counts
## # A tibble: 187 x 3
##    word      sentiment        n
##    <chr>     <chr>        <int>
##  1 president positive         4
##  2 president trust            4
##  3 nation    trust            3
##  4 cancer    anger            2
##  5 cancer    disgust          2
##  6 cancer    fear             2
##  7 cancer    negative         2
##  8 cancer    sadness          2
##  9 defense   anger            2
## 10 defense   anticipation     2
## # ... with 177 more rows
nrc_word_counts %>% 
  filter(n > 1) %>%
  ggplot() +
  geom_histogram(aes(x = reorder(word, n), y = n), stat = "identity", width = 0.75) +
  coord_flip() +
  facet_wrap(vars(sentiment), scales = "free") + 
  theme(axis.text.y = element_text(size = rel(0.75))) + 
  labs(x = "Word", y = "", title = "Words with Top Fifty FavoriteCount by Sentiment")

4.5 Compared with @JoeBiden

reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
tweet_words2 <- JoeBidenTweets %>%
    filter(!str_detect(tweet, '^"')) %>%
    mutate(text = str_replace_all(tweet, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
    unnest_tokens(word, text, token = "regex", pattern = reg) %>%
    filter(!word %in% stop_words$word,
           str_detect(word, "[a-z]")) %>%
    arrange(desc(likes))
    
sentiment_counts <- tweet_words2 %>%
  left_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
sentiment_counts %>% 
  group_by(sentiment) %>% 
  top_n(8, n) %>% 
  arrange(desc(n)) %>% 
  ungroup() %>%
  ggplot() +
  geom_histogram(aes(x = reorder(word, n), y = n), stat = "identity", width = 0.75) +
  coord_flip() +
  facet_wrap(vars(sentiment), scales = "free") + 
  theme(axis.text.y = element_text(size = rel(0.75))) + 
  labs(x = "Word", y = "", title = "Frequency of Occurrence by Sentiment")